# Basic packages
library(tidyverse)
library(ggthemes)
# Data Package
library(gapminder)
# Specialty Graph Packages
library(ggridges)
library(cowplot)
library(GGally)
# Colorscale and adjustment packages 
library(viridis)
library(scales)
# Interactive packages
library(plotly)
# Forecasting packages
library(fable)
library(feasts)
library(tsibble)
library(tsbox)
library(Quandl)
library(dygraphs)
library(highcharter)
library(lubridate)
library(zoo)
# Qualitative packages
library(wordcloud2)
library(extrafont)
library(tidytext)
library(textdata)
library(sentimentr)

# Shared theme used by the plots in this file: theme_fivethirtyeight() with
# white backgrounds, light grey horizontal grid lines and no vertical grid
# lines. (Optional font: add base_family = "Open Sans" to the base theme call.)
theme_new <- theme_fivethirtyeight(base_size = 12) %+replace%
  theme(
    panel.grid.major.y = element_line(colour = "grey80", size = 0.25),
    panel.grid.major.x = element_blank(),
    panel.background = element_rect(fill = "white"),
    plot.background = element_rect(fill = "white"),
    legend.background = element_rect(fill = "white")
  )

Variation (1 Variable)

This section covers univariate graphs (one variable at a time).

Cat

Bar chart

# Bar chart: number of diamonds per cut, one fill colour per cut level.
ggplot(diamonds, aes(x = cut, fill = cut)) +
  geom_bar() +
  theme_new +
  labs(title = "Title", subtitle = "Subtitle")

Num

Histogram

# Histogram of price using bins 80 units wide.
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 80, fill = "#afd7db") +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "Variable",
    y = "Count"
  ) +
  theme_new
# Optional axis-title styling:
# + theme(axis.title = element_text(size = 10, color = "grey40"))

Boxplots

You could make a univariate boxplot… but why? This should probably be a bivariate plot most of the time.

# Single horizontal boxplot of price.
ggplot(diamonds, aes(x = price)) +
  geom_boxplot(fill = "#afd7db") +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "Variable"
  ) +
  theme_new
# Optional axis-title styling:
# + theme(axis.title = element_text(size = 10, color = "grey40"))

Combined Histogram & Boxplot

You can put any graphs together in a grid. Here is just one example. Note: if you are using a different theme, you may have to adjust the margins here.

# Top panel: a slim boxplot of price that carries the title; axis text, grid
# lines and panel background are blanked so only the box remains.
plt1 <- ggplot(diamonds, aes(x = price)) +
  geom_boxplot(
    fill = "#afd7db",
    width = .1,
    size = .3,
    outlier.alpha = .01
  ) +
  labs(title = "Title") +
  theme_new +
  theme(
    axis.text = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    plot.subtitle = element_text(margin = margin(0, 0, 15, 0))
  )

# Bottom panel: histogram of the same variable, with the top plot margin set
# to zero.
plt2 <- ggplot(diamonds, aes(x = price)) +
  geom_histogram(fill = "#afd7db", binwidth = 30) +
  (theme_new %+replace% theme(plot.margin = margin(0, 10, 10, 10)))

# Stack the two panels in one column, aligned vertically on their left and
# right axes; the histogram gets 2.5 times the height of the boxplot.
cowplot::plot_grid(
  plt1, plt2,
  ncol = 1,
  rel_heights = c(1, 2.5),
  align = "v",
  axis = "lr"
)

Covariation (2 Variables)

Cat/Cat

Bar graph: Stacked

Note: some people find it hard to compare areas in this visualization.

# Stacked bar chart: counts per cut, each bar split by color.
ggplot(diamonds, aes(x = cut, fill = color)) +
  geom_bar() +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "Variable"
  ) +
  theme_new
# Optional axis-title styling:
# + theme(axis.title = element_text(size = 10, color = "grey40"))

Bar graph: Dodged

# Dodged bar chart: counts per cut with the color groups side by side.
ggplot(diamonds, aes(x = cut, fill = color)) +
  geom_bar(position = "dodge") +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "Variable"
  ) +
  theme_new
# Optional axis-title styling:
# + theme(axis.title = element_text(size = 10, color = "grey40"))

Bar graph: Proportional

# Proportional bar chart: position = "fill" rescales every clarity bar to the
# same height so the color shares can be compared.
ggplot(diamonds, aes(x = clarity, fill = color)) +
  geom_bar(width = 1, position = "fill") +
  theme_new +
  labs(title = "Title", subtitle = "Subtitle")

Bar graph: Facetted

This graph is the easiest to read of the multivariate bar graphs. Note: if you want to switch the orientation, change the facet formula from “. ~ variable” to “variable ~ .”. It may be smart to adjust the axis grids as well.

# Horizontal bars of clarity counts, one facet column per color.
ggplot(diamonds, aes(y = clarity, fill = clarity)) +
  geom_bar() +
  facet_grid(. ~ color) +
  labs(title = "Title", subtitle = "Subtitle") +
  theme_new +
  # Rotate and shrink the x-axis labels.
  theme(axis.text.x = element_text(angle = 90, size = 8))

Count Plots

# Count plot: one point per color/clarity combination, sized by the number of
# observations.
ggplot(diamonds, aes(x = color, y = clarity)) +
  geom_count(color = "#afd7db") +
  theme_new +
  labs(title = "Title", subtitle = "Subtitle")

Heat Map

# Heat map of the number of diamonds (n) in each cut/clarity cell.
ggplot(count(diamonds, cut, clarity), aes(x = cut, y = clarity, fill = n)) +
  geom_tile() +
  scale_fill_distiller(palette = "GnBu") +
  labs(title = "Title", subtitle = "Subtitle") +
  theme_new +
  theme(
    legend.position = "right",
    legend.direction = "vertical",
    panel.grid.major.y = element_blank()
  )

Heat Map with Labels

# Heat map of cut/clarity counts with the count printed in each tile; the
# legend is hidden because the labels carry the values.
ggplot(count(diamonds, cut, clarity), aes(x = cut, y = clarity, fill = n)) +
  geom_tile() +
  geom_text(aes(label = n), color = "black") +
  scale_fill_distiller(palette = "GnBu") +
  labs(title = "Title", subtitle = "Subtitle") +
  theme_new +
  theme(
    legend.position = "none",
    panel.grid.major.y = element_blank()
  )

Pyramid Chart (back to back bargraph)

One categorical variable needs to be binary for this plot. Since I do not have one in the diamonds dataset, I am artificially creating one.

# Data prep: keep two cut levels, count rows per clarity/cut, and flip the
# sign of the "Fair" counts so those bars extend to the left of zero.
diamonds %>%
  filter(cut %in% c("Fair", "Good")) %>%
  count(clarity, cut) %>%
  mutate(positive_negative = n * ifelse(cut == "Fair", -1, 1)) %>%
  # Plot: back-to-back horizontal columns, one colour per cut.
  ggplot(aes(x = positive_negative, y = clarity, fill = cut)) +
  geom_col() +
  scale_fill_manual(values = c("#afd7db", "#fce3bd")) +
  theme_new +
  labs(title = "Title", subtitle = "Subtitle")

Cat/Num

Freqpoly

I am not a big fan of the freqpoly for univariate analysis (density plots are much more attractive and bar charts are more common) but I think they do nicely for comparing the distributions of a categorical/num variable combo.

# Frequency polygons of price, one line per cut.
ggplot(diamonds, aes(x = price, color = cut)) +
  geom_freqpoly(alpha = .6, size = 1) +
  theme_new +
  labs(title = "Title", subtitle = "Subtitle")

Density plot

This plot works better with 2-4 categories. When there are more categories than that, I would stagger or stack them, as seen below. You may decide whether to remove or keep the y axis labels.

# Overlaid, semi-transparent density curves of price, one per cut.
ggplot(diamonds, aes(x = price, fill = cut, color = cut)) +
  geom_density(alpha = .3) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "X Variable"
  ) +
  theme_new +
  theme(
    axis.title = element_text(size = 10, color = "grey40"),
    axis.text.y = element_blank(), # hide the density values
    legend.position = "right",
    legend.direction = "vertical"
  )

Density plot: Faceted

# Density curve of price per cut, each cut drawn in its own facet.
ggplot(diamonds, aes(x = price, fill = cut, color = cut)) +
  geom_density(alpha = .3) +
  facet_wrap(~cut) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "X Variable"
  ) +
  theme_new +
  theme(
    axis.title = element_text(size = 10, color = "grey40"),
    axis.text.y = element_blank(), # hide the density values
    legend.position = "right",
    legend.direction = "vertical"
  )

Density ridges: Colored by y

# Ridgeline plot: one price density per clarity level, coloured by clarity.
ggplot(
  diamonds,
  aes(x = price, y = clarity, fill = clarity, color = clarity)
) +
  geom_density_ridges(alpha = .3) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "X Variable",
    y = "Y Variable"
  ) +
  theme_new +
  theme(
    axis.title.x = element_text(size = 10, color = "grey40"),
    legend.position = "none"
  )

Density ridges: Colored by x

# Ridgeline plot of price per clarity level where the fill follows the
# x value (price) through a three-colour gradient.
diamonds %>%
  ggplot(aes(x = price, y = clarity)) +
  # after_stat(x) replaces the deprecated `..x..` notation.
  geom_density_ridges_gradient(aes(fill = after_stat(x))) +
  scale_fill_gradientn(
    colours = c("#0D0887FF", "#ff9900", "#ffffff")
  ) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "X Variable",
    y = "Y Variable"
  ) +
  theme_new +
  theme(
    legend.position = "none",
    axis.title.x = element_text(size = 10, color = "grey40")
  )

Boxplot

One to many. Here is the real purpose of boxplots. I am not a huge fan of this color scheme with this particular graph, though; I would pick something else. Notice that in this case I have reordered the categories by the median of price, which is a big no-no if you have an ordinal variable, but I wanted the code here to show how you would do that.

# Boxplot of price per cut, with the cuts ordered by their median price.
ggplot(
  diamonds,
  aes(
    x = fct_reorder(cut, price, median, na.rm = TRUE),
    y = price,
    fill = cut
  )
) +
  geom_boxplot(outlier.alpha = .1) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = ""
  ) +
  theme_new +
  theme(
    axis.title = element_text(size = 10, color = "grey40"),
    axis.title.x = element_text(hjust = .02, vjust = .4),
    legend.position = "none"
  )

Dotplot

One to many. This looks like a Rorschach ink blot, but it is another way to show a distribution.

# Centered dot plot of price for three cut levels.
diamonds %>%
  # Subsetting is only for this example; the graph does not require it.
  filter(cut %in% c("Fair", "Good", "Very Good")) %>%
  ggplot(aes(x = cut, y = price, color = cut)) +
  # binwidth must be tuned to your own data.
  geom_dotplot(binaxis = "y", stackdir = "center", binwidth = 50) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = ""
  ) +
  theme_new +
  theme(
    axis.title = element_text(size = 10, color = "grey40"),
    axis.title.x = element_text(hjust = .02, vjust = .4),
    legend.position = "none"
  )

Boxplot with dotplot

# Boxplot with a faint dot plot drawn on top, for three cut levels ordered by
# median price. Boxplot outliers are made fully transparent because the dots
# already show every observation.
diamonds %>%
  # Subsetting is only for this example; the graph does not require it.
  filter(cut %in% c("Fair", "Good", "Very Good")) %>%
  ggplot(
    aes(
      x = fct_reorder(cut, price, median, na.rm = TRUE),
      y = price,
      fill = cut
    )
  ) +
  geom_boxplot(outlier.alpha = 0) +
  # binwidth must be tuned to your own data.
  geom_dotplot(
    binaxis = "y",
    stackdir = "center",
    binwidth = 50,
    alpha = .1
  ) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = ""
  ) +
  theme_new +
  theme(
    axis.title = element_text(size = 10, color = "grey40"),
    axis.title.x = element_text(hjust = .02, vjust = .4),
    legend.position = "none"
  )

Violin plot

This plot is reordered by the frequency of each factor.

# Violin plot of price per cut (cuts ordered from least to most frequent),
# overlaid with a narrow boxplot and a red diamond marking the mean.
ggplot(diamonds, aes(x = fct_rev(fct_infreq(cut)), y = price)) +
  geom_violin(fill = "#afd7db", alpha = .6, size = 0) +
  geom_boxplot(
    fill = "#ffffff",
    width = 0.1,
    size = .2,
    alpha = 0.5,
    outlier.alpha = .01
  ) +
  stat_summary(
    geom = "point",
    fun = "mean",
    shape = 23,
    size = 1,
    stroke = 0.75,
    color = "red",
    fill = "red",
    alpha = .6
  ) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = ""
  ) +
  theme_new +
  theme(
    axis.title = element_text(size = 10, color = "grey40"),
    axis.title.x = element_text(hjust = .02, vjust = .4),
    legend.position = "none"
  )

Violin plot with scatter

This plot works better with smaller datasets (which is why I took a smaller sample of the data)

# Violin plot with jittered raw points on top. A random sample of 1000 rows
# keeps the scatter readable; most datasets will not need the sampling step.
ggplot(
  sample_n(diamonds, 1000),
  aes(x = fct_rev(fct_infreq(cut)), y = price)
) +
  geom_violin(fill = "#afd7db", alpha = .6, size = 0) +
  geom_point(position = "jitter", size = 1, alpha = 0.1) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = ""
  ) +
  theme_new +
  theme(
    axis.title = element_text(size = 10, color = "grey40"),
    axis.title.x = element_text(hjust = .02, vjust = .4),
    legend.position = "none"
  )

Column chart

One to one. A col chart looks almost exactly like a bar graph, but it plots two variables instead of one, and you can choose the stat instead of the default “count”. This requires a one to one relationship between the variables, as in the numeric column chart version. But unlike the numeric version, we can get around this by summarizing one of the categories by a stat. Here we are using the mean, but we could use the median, standard deviation, min, max, or just about any other stat we can think of. Unless the cat variable is ordinal (like this one), I recommend you reorder it by the stat you are using. (The commented-out line shows you how to do that.)

# Column chart of mean price per clarity level.
# Data prep (only needed to get one row per category): group, summarise,
# arrange.
diamonds %>%
  group_by(clarity) %>%
  summarise(mean_price = mean(price)) %>%
  arrange(mean_price) %>%
  # Plot (always needed). To order the columns by the stat instead, use:
  # ggplot(aes(x = fct_reorder(clarity, mean_price), y = mean_price, fill = clarity)) +
  ggplot(aes(x = clarity, y = mean_price, fill = clarity)) +
  geom_col() +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "",
    y = "Y Variable"
  ) +
  theme_new +
  theme(axis.title = element_text(size = 10, color = "grey40"))

Column chart (labelled)

A col chart looks almost exactly like a bar graph, but it plots two variables instead of one, and you can choose the stat instead of the default “count”. This requires a one to one relationship between the variables, as in the numeric column chart version. But unlike the numeric version, we can get around this by summarizing one of the categories by a stat. Here we are using the mean, but we could use the median, standard deviation, min, max, or just about any other stat we can think of. Unless the cat variable is ordinal (like this one), I recommend you reorder it by the stat you are using. (The commented-out line in the previous column chart shows you how to do that.)

# Column chart of mean price per clarity level with the rounded dollar value
# printed inside each column.
# Data prep (only needed to get one row per category): group, summarise,
# arrange.
diamonds %>%
  group_by(clarity) %>%
  summarise(mean_price = mean(price)) %>%
  arrange(mean_price) %>%
  # Plot (always needed).
  ggplot(aes(x = clarity, y = mean_price, fill = clarity)) +
  geom_col() +
  geom_text(
    aes(label = paste0("$", round(mean_price))),
    colour = "white",
    vjust = 1.5
  ) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "",
    y = "Y Variable"
  ) +
  theme_new +
  theme(
    axis.title = element_text(size = 10, color = "grey40"),
    legend.position = "none"
  )

Column chart with error bars

# Column chart of mean depth per cut with error bars of +/- one standard
# deviation.
# Data prep (only needed to get one row per category): group, summarise,
# arrange.
diamonds %>%
  group_by(cut) %>%
  summarise(
    mean_depth = mean(depth),
    sd_depth = sd(depth)
  ) %>%
  arrange(mean_depth) %>%
  # Plot (always needed).
  ggplot(aes(x = cut, y = mean_depth, fill = cut)) +
  geom_col() +
  geom_errorbar(
    aes(
      ymin = mean_depth - sd_depth,
      ymax = mean_depth + sd_depth
    ),
    width = 0.1
  ) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "",
    y = "Y Variable"
  ) +
  theme_new +
  theme(axis.title = element_text(size = 10, color = "grey40"))

Scatterplot with jitter

(Note the y axis has been changed with ylim. You may have to adjust or delete this setting.)

# Jittered scatter of price against clarity; the y axis is limited to
# 0-20000.
ggplot(diamonds, aes(x = clarity, y = price)) +
  geom_point(position = "jitter", color = "#183054", alpha = .1) +
  ylim(0, 20000) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "x variable",
    y = "y variable"
  ) +
  theme_new +
  theme(axis.title = element_text(size = 10, color = "grey40"))

Lollipop Chart

One to one relationship, meaning each category can only have one associated numerical value. (You can also achieve this by running a stat like in the cat/num column chart example). This is good for a cat variable with lots of categories.

# Lollipop chart: 2007 GDP per capita for each country in the Americas,
# countries ordered by GDP per capita.
gapminder %>%
  # year is numeric in gapminder, so compare against a number, not a string.
  filter(continent == "Americas", year == 2007) %>%
  # Reorder once so every layer shares the same x factor.
  mutate(country = fct_reorder(country, gdpPercap)) %>%
  ggplot(aes(x = country, y = gdpPercap)) +
  # Stick: from zero up to the value.
  geom_segment(
    aes(xend = country, y = 0, yend = gdpPercap),
    color = "grey"
  ) +
  # Head of the lollipop.
  geom_point(size = 3, color = "#afd7db") +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "x variable",
    y = "y variable"
  ) +
  coord_flip() +
  theme_new +
  theme(panel.grid.major.y = element_blank())

Lollipop Chart with 2 cats

This is technically a Cat/Cat/Num plot, but I am putting it here with Lolli.

# Two-ended lollipop chart: GDP per capita in 1952 and 2007 for each country in
# the Americas, countries ordered by the 2007 value.
gapminder %>%
  filter(continent == "Americas") %>%
  # One row per country, with one column per measure and year
  # (e.g. gdpPercap_1952, gdpPercap_2007).
  pivot_wider(
    names_from = year,
    values_from = c(lifeExp, pop, gdpPercap)
  ) %>%
  # Reorder once so the segment and both point layers share the same x
  # factor instead of mixing reordered and raw country levels.
  mutate(country = fct_reorder(country, gdpPercap_2007)) %>%
  ggplot(aes(x = country)) +
  geom_segment(
    aes(xend = country, y = gdpPercap_1952, yend = gdpPercap_2007),
    color = "grey"
  ) +
  geom_point(aes(y = gdpPercap_1952), size = 3, color = "#afd7db") +
  geom_point(aes(y = gdpPercap_2007), size = 3, color = "#f0b684") +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "x variable",
    y = "y variable"
  ) +
  coord_flip() +
  theme_new +
  theme(panel.grid.major.y = element_blank())

Histogram with Facet Grid

# Faceted histogram of price per cut. Each facet also shows the histogram of
# the whole dataset in grey: the background layer uses the data without the
# `cut` column, so it is repeated in every facet.
diamonds %>%
  ggplot(aes(price, fill = cut)) +
  # bins is set explicitly on both layers so the background and foreground
  # bars always line up.
  geom_histogram(
    data = select(diamonds, -cut),
    bins = 30,
    fill = "grey",
    show.legend = FALSE
  ) +
  geom_histogram(bins = 30) +
  scale_fill_viridis_d() +
  facet_wrap(~cut) +
  labs(
    title = "Title",
    subtitle = "Subtitle"
  ) +
  theme_new

Density with Facet Grid

# Faceted density of price per cut, scaled to counts. Each facet also shows
# the curve for the whole dataset in grey: the background layer uses the data
# without the `cut` column, so it is repeated in every facet.
diamonds %>%
  # after_stat(count) replaces the deprecated `..count..` notation.
  ggplot(aes(price, y = after_stat(count))) +
  geom_density(
    data = select(diamonds, -cut),
    fill = "grey",
    color = "grey",
    show.legend = FALSE
  ) +
  geom_density(aes(fill = cut, color = cut), show.legend = FALSE) +
  scale_fill_viridis_d(alpha = .5) +
  facet_wrap(~cut) +
  labs(
    title = "Title",
    subtitle = "Subtitle"
  ) +
  theme_new

Num/Num

Scatterplot with trendline (loess)

# Scatter of price against carat with a smoothed trend line from
# stat_smooth()'s default method.
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point(alpha = .1, color = "#183054") +
  stat_smooth(size = .7, alpha = .2, color = "#ffa781") +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "x variable",
    y = "y variable"
  ) +
  theme_new +
  theme(axis.title = element_text(size = 10, color = "grey40"))

Scatterplot with trendline (linear)

(Note the y axis has been changed with ylim. You may have to adjust or delete this setting.)

# Scatter of price against carat with a linear-model trend line; the y axis
# is limited to 0-20000.
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point(alpha = .1, color = "#183054") +
  geom_smooth(method = lm, size = .7, color = "#ffa781") +
  ylim(0, 20000) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "x variable",
    y = "y variable"
  ) +
  theme_new +
  theme(axis.title = element_text(size = 10, color = "grey40"))

Scatterplot with contour lines

(Note the y axis has been changed with ylim. You may have to adjust or delete this setting.)

# Scatter of price against carat with 2D density contour lines on top; the
# y axis is limited to 0-20000.
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point(alpha = .1, color = "#183054") +
  geom_density2d(alpha = .5, color = "#ffa781") +
  ylim(0, 20000) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "x variable",
    y = "y variable"
  ) +
  theme_new +
  theme(axis.title = element_text(size = 10, color = "grey40"))

Scatterplot with count (size)

This is good for datasets with a lot of overlapping data. It is an alternative to jitter.

# Scatter of petal width against petal length where the point size shows how
# many observations share the same coordinates.
ggplot(iris, aes(x = Petal.Length, y = Petal.Width)) +
  geom_count(alpha = .8, color = "#183054") +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "x variable",
    y = "y variable"
  ) +
  theme_new +
  theme(axis.title = element_text(size = 10, color = "grey40"))

Scatterplot with count (colored)

Also an option for numeric data with a lot of overlapping data.

# Scatter of petal width against petal length where the point colour shows
# how many observations share the same coordinates.
iris %>%
  # count() returns one ungrouped row per coordinate pair; unlike
  # group_by() + summarise() it leaves no grouping behind and emits no
  # regrouping message.
  count(Petal.Length, Petal.Width, name = "count") %>%
  ggplot(aes(x = Petal.Length, y = Petal.Width, color = count)) +
  # Size is fixed, so only the colour encodes the count.
  geom_count(size = 5, alpha = .8) +
  scale_color_viridis() +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "x variable",
    y = "y variable"
  ) +
  theme_new +
  theme(axis.title = element_text(size = 10, color = "grey40"))

Numeric Boxplots

Another option for over-plotting.

# One boxplot of life expectancy per survey year. Because year is numeric,
# the group aesthetic is needed to get a separate box for each year.
gapminder %>%
  ggplot(aes(x = year, y = lifeExp)) +
  # The stray trailing comma (an empty argument) has been removed.
  geom_boxplot(aes(group = year), width = 1) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    y = "Y Variable",
    x = ""
  ) +
  theme_new +
  theme(axis.title = element_text(size = 10, color = "grey40")) +
  # To increase the frequency of axis ticks.
  scale_x_continuous(breaks = seq(1952, 2007, 5))

Two dimensional density (hex)

(Note the y axis has been changed with ylim. You may have to adjust or delete this setting.)

# Hexbin chart of price against depth with 70 bins and a viridis fill; the
# y axis is limited to 0-15000.
ggplot(diamonds, aes(x = depth, y = price)) +
  geom_hex(bins = 70) +
  scale_fill_continuous(type = "viridis") +
  ylim(0, 15000) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "x variable",
    y = "y variable"
  ) +
  theme_new

Two dimensional density (polygon)

# Filled 2D density contours of price against depth.
diamonds %>%
  ggplot(aes(x = depth, y = price)) +
  # after_stat(level) replaces the deprecated `..level..` notation.
  stat_density_2d(aes(fill = after_stat(level)), geom = "polygon") +
  # Only one fill scale is kept: the chunk used to add scale_fill_viridis()
  # and then this scale, and the later one replaced the earlier one.
  scale_fill_continuous(type = "viridis") +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "x variable",
    y = "y variable"
  ) +
  theme_new

Two dimensional density (contour map)

# 2D density contour lines of price against depth.
# No fill scale is added: this plot maps nothing to fill, so the two fill
# scales that used to be here had no effect on the plot.
diamonds %>%
  ggplot(aes(x = depth, y = price)) +
  geom_density2d() +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "x variable",
    y = "y variable"
  ) +
  theme_new

Two dimensional density (rectangular heatmap)

# Rectangular 2D bin heatmap of price against depth; each bin is 1 unit of
# depth wide and 1000 units of price tall.
diamonds %>%
  ggplot(aes(x = depth, y = price)) +
  geom_bin2d(binwidth = c(1, 1000)) +
  # Only one fill scale is kept: the chunk used to add scale_fill_viridis()
  # and then this scale, and the later one replaced the earlier one.
  scale_fill_continuous(type = "viridis") +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "x variable",
    y = "y variable"
  ) +
  theme_new

Column Chart

You need a one to one relationship in your dataset to map this. For example, each numeric x only has one numeric y in your dataset.

# Column chart of life expectancy by year for a single country.
# Data prep (only needed to get one y value per x value): keep one country.
gapminder %>%
  filter(country == "Bangladesh") %>%
  # Plot (always needed).
  ggplot(aes(x = year, y = lifeExp)) +
  geom_col(fill = "#afd7db") +
  # To increase the frequency of axis ticks.
  scale_x_continuous(breaks = seq(1952, 2007, 5)) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "",
    y = "Y Variable"
  ) +
  theme_new +
  theme(axis.title = element_text(size = 10, color = "grey40"))

Column Chart with negative

You need a one to one relationship in your dataset to map this, as well as positive and negative data. For example, each numeric x only has one numeric y in your dataset.

# Column chart of unemployment as the deviation from its mean, with columns
# coloured by the sign of the deviation.
# Data prep (not necessary for the graph itself): centre the series on its
# mean and flag the non-negative values.
economics_long %>%
  filter(variable == "unemploy") %>%
  mutate(
    value2 = value - mean(value),
    # The comparison already yields TRUE/FALSE; no ifelse() wrapper needed.
    pos = value2 >= 0
  ) %>%
  # Plot.
  ggplot(aes(x = date, y = value2, fill = pos)) +
  geom_col(position = "identity") +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    y = "Y Variable",
    x = ""
  ) +
  theme_new +
  theme(legend.position = "none")

Covariation (3+ Variables)

Before you jump into visualizations with more than three variables, you should ask yourself: “Do I really need to put these together in the same visualization?” These graphs can get complex and overwhelming fast. Ask yourself whether your audience will really take the time to look at these graphs that you have created.

Any Variable Combination

GGally correlations (no fill)

This accepts any type of variable. Rescale it to see it better. This is just one of many types of graphs GGally can produce. See the documentation below for more graphs and examples.

https://ggobi.github.io/ggally/reference/ggpairs.html

# Pairwise plot matrix for the first five columns of diamonds.
ggpairs(diamonds, columns = 1:5) +
  theme_new

GGally correlations (filled by cat)

This accepts any type of variable. Rescale it to see it better. This is just one of many types of graphs GGally can produce. See the documentation below for more graphs and examples.

https://ggobi.github.io/ggally/reference/ggpairs.html

# Pairwise plot matrix for the first five columns of diamonds, filled by cut.
ggpairs(diamonds, mapping = aes(fill = cut), columns = 1:5) +
  theme_new

Parallel Coordinate Plot

# Parallel coordinate plot over columns 1 and 5-7 of a random 50-row sample,
# with lines grouped (and coloured) by column 2.
ggparcoord(
  sample_n(diamonds, 50),
  columns = c(1, 5:7),
  groupColumn = 2,
  title = "Title",
  showPoints = TRUE,
  alphaLines = 0.3
) +
  scale_color_viridis(discrete = TRUE) +
  theme_new +
  theme(legend.key = element_rect(fill = "#ffffff"))

Cat/Cat/Cat

Facet grid

Note: you may have to adjust the height of this plot when you save it or change the theme to something taller.

# Bar chart of color counts in a grid of facets: clarity in rows, cut in
# columns.
ggplot(diamonds, aes(x = color, fill = color)) +
  geom_bar() +
  facet_grid(clarity ~ cut) +
  labs(title = "Title", subtitle = "Subtitle") +
  theme_new +
  theme(
    legend.position = "right",
    legend.direction = "vertical",
    legend.key.size = unit(.3, "cm")
  )

Facet wrap

Again, this is not the best theme for the aspect ratio of this graph but it’s something you should change in ggsave.

# Stacked bar chart of color counts split by cut, one wrapped facet per
# clarity level.
ggplot(diamonds, aes(x = color, fill = cut)) +
  geom_bar() +
  facet_wrap(~clarity) +
  labs(title = "Title", subtitle = "Subtitle") +
  theme_new +
  theme(
    legend.position = "right",
    legend.direction = "vertical",
    legend.key.size = unit(.3, "cm")
  )

Cat/Cat/Num

Heatmap

# Heat map of total price for each cut/color combination.
diamonds %>%
  group_by(cut, color) %>%
  summarize(price = sum(price)) %>%
  ggplot(aes(x = cut, y = color, fill = price)) +
  geom_tile() +
  scale_fill_viridis(option = "B") +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "",
    y = ""
  ) +
  theme_new +
  theme(
    axis.title = element_text(),
    panel.grid.major.y = element_blank(),
    legend.position = "right",
    legend.direction = "vertical"
  )

Heatmap with labels

# Heat map of total price for each cut/color combination, with the total
# printed in each tile.
diamonds %>%
  group_by(cut, color) %>%
  summarize(price = sum(price)) %>%
  ggplot(aes(x = cut, y = color, fill = price)) +
  geom_tile() +
  geom_text(aes(label = price), color = "white") +
  scale_fill_viridis(option = "B") +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "",
    y = ""
  ) +
  theme_new +
  theme(
    axis.title = element_text(),
    panel.grid.major.y = element_blank(),
    legend.position = "right",
    legend.direction = "vertical"
  )

Num/Num/Cat

Scatterplot w/ accent

(Note the y axis has been changed with ylim. You may have to adjust or delete this setting.)

# Scatter of price against carat with one clarity level ("I1") highlighted:
# all points are drawn in dark blue, then the I1 subset is drawn again in
# orange on top. The y axis is limited to 0-20000.
i1 <- filter(diamonds, clarity == "I1")

ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point(alpha = .1, color = "#183054") +
  geom_point(data = i1, alpha = .2, color = "#ffa781") +
  ylim(0, 20000) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "x variable",
    y = "y variable"
  ) +
  theme_new +
  theme(axis.title = element_text(size = 10, color = "grey40"))

Scatterplot with cat colored

(Note the y axis has been changed with ylim. You may have to adjust or delete this setting.)

# Scatter of price against carat, coloured by clarity; the y axis is limited
# to 0-20000.
ggplot(diamonds, aes(x = carat, y = price, color = clarity)) +
  geom_point(alpha = .1) +
  ylim(0, 20000) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "x variable",
    y = "y variable"
  ) +
  theme_new +
  theme(
    axis.title = element_text(size = 10, color = "grey40"),
    legend.position = "right",
    legend.direction = "vertical"
  )

Facet with numericals

# Scatter of depth against table, coloured by cut, one facet column per
# clarity level. coord_cartesian() zooms the view without dropping data.
ggplot(diamonds) +
  geom_point(aes(x = table, y = depth, col = cut)) +
  facet_grid(. ~ clarity) +
  coord_cartesian(xlim = c(45, 75), ylim = c(50, 75)) +
  scale_x_continuous(breaks = seq(45, 75, 5)) +
  scale_y_continuous(breaks = seq(50, 75, 5)) +
  scale_color_brewer(name = "Diamond Cut Quality", palette = "Set1") +
  labs(
    title = "Comparing table and depth of diamonds",
    x = "Table",
    y = "Depth"
  ) +
  theme_new +
  theme(legend.key = element_rect(fill = "#ffffff"))

Line Graph

Custom colors have been included in this graph. These may be adjusted by changing the hexcode. Numerical data needs a one to one relationship.

# Line graph of life expectancy over time for five countries, using a manual
# colour palette (one hex code per country).
# Data prep: keep the five countries of interest.
gapminder %>%
  filter(
    country %in% c("China", "United States", "Haiti", "Afghanistan", "Myanmar")
  ) %>%
  # Plot.
  ggplot(aes(x = year, y = lifeExp, color = country)) +
  geom_line(size = 1) +
  scale_color_manual(
    values = c("#f5c242", "#731130", "#139e90", "#296e91", "#ab5107")
  ) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    caption = "Source: "
  ) +
  theme_new +
  theme(
    legend.key = element_rect(fill = "#ffffff"),
    legend.title = element_blank()
  )

Line Graph with points

Custom colors have been included in this graph. These may be adjusted by changing the hexcode. Numerical data needs a one to one relationship.

# Line graph with a point at every observation: life expectancy from 1990
# onwards for five countries, using a manual colour palette.
# Data prep: keep recent years and the five countries of interest.
gapminder %>%
  filter(
    year >= 1990,
    country %in% c("China", "United States", "Haiti", "Afghanistan", "Myanmar")
  ) %>%
  # Plot.
  ggplot(aes(x = year, y = lifeExp, color = country)) +
  geom_line(size = .5) +
  geom_point() +
  scale_color_manual(
    values = c("#f5c242", "#731130", "#139e90", "#296e91", "#ab5107")
  ) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    caption = "Source: "
  ) +
  theme_new +
  theme(
    legend.key = element_rect(fill = "#ffffff"),
    legend.title = element_blank()
  )

Stacked Area Chart (non-ordered category)

One to many relationship

# Stacked area chart of mean GDP per capita by continent over time.
# Data prep (only needed to get one value per year and category).
gapminder %>%
  group_by(year, continent) %>%
  summarize(mean_gdp = mean(gdpPercap)) %>%
  # Plot (always needed).
  ggplot(aes(x = year, y = mean_gdp, fill = continent)) +
  geom_area() +
  # Label every 5 years starting at 1952 to match the data.
  scale_x_continuous(breaks = seq(1952, 2007, 5)) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "",
    y = "Y Variable"
  ) +
  theme_new +
  theme(axis.title = element_text(size = 10, color = "grey40"))

Stacked Area Chart (for ordinal categories)

One to many relationship with an ordinal category (continent is non-ordered but let’s just pretend it is!)

# Stacked area chart of mean GDP per capita by continent over time, styled
# for an ordinal category: sequential "Blues" fill with thin black outlines.
# Data prep (only needed to get one value per year and category).
gapminder %>%
  group_by(year, continent) %>%
  summarize(mean_gdp = mean(gdpPercap)) %>%
  # Plot (always needed).
  ggplot(aes(x = year, y = mean_gdp, fill = continent)) +
  geom_area(alpha = .4, size = .2, colour = "black") +
  scale_fill_brewer(palette = "Blues") +
  # Label every 5 years starting at 1952 to match the data.
  scale_x_continuous(breaks = seq(1952, 2007, 5)) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "",
    y = "Y Variable"
  ) +
  theme_new +
  theme(axis.title = element_text(size = 10, color = "grey40"))

Proportional Stacked Area Chart (for ordinal categories)

One to many relationship with an ordinal category (continent is non-ordered but let’s just pretend it is!)

# Proportional stacked area chart: position = "fill" rescales each year's
# stack of mean GDP per capita by continent to the same total height.
# Data prep (only needed to get one value per year and category).
gapminder %>%
  group_by(year, continent) %>%
  summarize(mean_gdp = mean(gdpPercap)) %>%
  # Plot (always needed).
  ggplot(aes(x = year, y = mean_gdp, fill = continent)) +
  geom_area(
    position = "fill",
    alpha = .4,
    size = .2,
    colour = "black"
  ) +
  scale_fill_brewer(palette = "Blues") +
  # Label every 5 years starting at 1952 to match the data.
  scale_x_continuous(breaks = seq(1952, 2007, 5)) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "",
    y = "Y Variable"
  ) +
  theme_new +
  theme(axis.title = element_text(size = 10, color = "grey40"))

Num/Num/Num

Heatmap with numerical

I have had a hard time getting this type of graph to look good with other data but I figured I would include it just in case.

# Tile heat map of the faithfuld data: density over waiting and eruptions.
ggplot(faithfuld) +
  geom_tile(aes(waiting, eruptions, fill = density)) +
  scale_fill_viridis_c() +
  labs(
    title = "Title",
    caption = "Source: "
  ) +
  theme_new +
  theme(
    legend.position = "right",
    legend.direction = "vertical"
  )

Interactive (Plotly)

ggplot with plotly

# Build a static stacked bar chart first ...
fig <- ggplot(diamonds, aes(x = cut, fill = color)) +
  geom_bar() +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "Variable"
  ) +
  theme_new

# ... then describe the hover label: its text font ...
font <- list(
  # family = "Open Sans",
  size = 15,
  color = "black"
)

# ... and its box (white background, no visible border).
label <- list(
  bgcolor = "#FFFFFF",
  bordercolor = "transparent",
  font = font
)

# Convert the ggplot to an interactive plotly figure and apply the hover
# label styling.
ggplotly(fig) %>%
  style(hoverlabel = label)
# To remove the configuration bar, append:
# %>% config(displayModeBar = FALSE)

3D plots (Num/Num/Num+)

# Interactive 3D scatter: carat, depth and price on the axes, coloured by
# clarity.
plot_ly(
  diamonds,
  x = ~carat,
  y = ~depth,
  z = ~price,
  color = ~clarity
) %>%
  add_markers(opacity = .6) %>%
  layout(
    title = "\ntitle",
    scene = list(
      xaxis = list(title = "carat"),
      yaxis = list(title = "depth"),
      zaxis = list(title = "price")
    )
  )

Geospatial

See the geospatial RMD

Forecasting

Turning a dataframe into a tsibble

I am going to comment this out so it doesn’t run, but here’s the code.

# dataset <- read.csv("dataset.csv")

# dataset$date <- as.Date(dataset$date)
# dataset_ts <- dataset %>% 
#                         mutate(date = yearmonth(date)) %>% # You don't always need this line, but it often helps
#                         as_tsibble(key = c(variable), 
#                                 index = date)

Basic Time Series (Line Graph)

# You only need to register an API key if you get an error that you have
# exceeded the Quandl anonymous user limit. Never paste the key into this
# file: store it in an environment variable (for example in ~/.Renviron) and
# load it at run time:
# Quandl.api_key(Sys.getenv("QUANDL_API_KEY"))

# Download the series as a `ts` object, starting in January 1990.
retail_raw <- Quandl(
  "FRED/RSAFSNA",
  type = "ts",
  start_date = "1990-01-01"
)

# Convert to a tsibble and give the index/value columns descriptive names.
retail <- retail_raw %>%
  as_tsibble() %>%
  rename(
    date = index,
    retail_revenue = value
  )

# With autoplot (faster but with less control)
retail %>% 
  autoplot()

# By hand (slower and with more control)
retail %>% 
  ggplot(aes(x = date, y = retail_revenue)) +
  geom_line(color = "#3c5d87") +
  stat_smooth(color = "#ffa781", alpha = .2, size = .7, span = .2) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "",
    y = "Retail Revenue",
    caption = "Source: U.S. Census Bureau "
  ) + theme_new

STL plot

# Fit an STL model to retail revenue, extract its components, and plot them.
autoplot(
  components(model(retail, STL(retail_revenue))),
  color = "#3c5d87"
) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "",
    y = "Retail Revenue",
    caption = "Source: U.S. Census Bureau "
  ) +
  theme_new

ETS (Forecasting)

# Fit an ETS model to retail_revenue (the variable), forecast 24 periods
# ahead, and plot the forecast together with `retail` (the dataset).
autoplot(
  forecast(model(retail, ETS(retail_revenue)), h = 24),
  retail,
  color = "#3c5d87"
) +
  labs(
    title = "Retail forecast for two years",
    subtitle = "",
    x = "",
    y = "Retail Revenue",
    caption = "Source: U.S. Census Bureau "
  ) +
  theme_new

Rate of Change

# Rate of change of retail revenue.
# Note: this starts from `retail_raw`, i.e. the data in the format we pulled
# it from originally, because ts_pca() is applied before the tsibble
# conversion.
retail_raw %>%
  ts_pca() %>%
  as_tsibble() %>%
  rename(
    date = index,
    retail_revenue = value
  ) %>%
  ggplot(aes(x = date, y = retail_revenue)) +
  # Zero reference line. theme_new uses a white panel background, so a white
  # line would be invisible; it is drawn in grey and placed beneath the data
  # so it does not cut through the series.
  geom_hline(yintercept = 0, color = "grey50") +
  geom_line(color = "#3c5d87") +
  # `span` is a loess parameter, so the smoother is requested explicitly.
  stat_smooth(
    method = "loess",
    formula = y ~ x,
    color = "#ffa781",
    alpha = .2,
    size = .7,
    span = .2
  ) +
  labs(
    title = "Retail revenue rate of change",
    subtitle = "",
    x = "",
    y = "Change in Retail Revenue over time",
    caption = "Source: U.S. Census Bureau"
  ) +
  theme_new

Plotly

# Convert the most recently created ggplot into an interactive plotly widget.
ggplotly(p = last_plot())

Dygraphs

# Interactive dygraph of the raw series with a range selector underneath.
dyRangeSelector(
  dygraph(retail_raw, main = "US Retail Sales")
)

hChart

# Interactive highcharter chart of the raw series, with a chart title.
hc_title(
  hchart(retail_raw),
  text = "US Retail Sales"
)

Qualitative (text analysis)

Single Word Analysis

First we need to take our text file and sort out all the individual words and their frequency.

# We will be using the "sentences" dataset that is loaded with tidyverse.
# Typically instead of these lines you would just read in your dataset.
# We also create an artificial category column (speaker) so we can do some
# comparisons. Don't worry about the code up to this point since it's just an
# example.
speaker <- rep(
  c("Sam", "Jane", "Julie", "Whitaker"),
  length.out = length(sentences)
)
test_text <- data.frame(
  text = sentences,
  speaker = speaker,
  stringsAsFactors = FALSE # keep the text as character on every R version
)

# Make a custom stopwords list that you can change over time.
# NA is included so that missing tokens are filtered out as well.
custom_stop <- c("get", "can", NA)

# Split into single words
single_split <- test_text %>%
  # word is new column name (don't change), text is existing text column name
  unnest_tokens(word, text) %>%
  # Explicit join key: only the `word` column identifies a stop word
  anti_join(get_stopwords(), by = "word") %>%
  filter(!word %in% custom_stop)

# Count them (grouped by a category. If there is no category, just group by
# word.)
single_count <- single_split %>%
  # You may have only word to group by, or you may have another category to
  # group by.
  group_by(speaker, word) %>%
  # "drop_last" keeps the result grouped by speaker, which is what
  # summarize() did implicitly; stating it silences the regrouping message.
  summarize(freq = n(), .groups = "drop_last")

# Take a look at single_count to adjust custom stop words if needed

Single Wordclouds

Remember these wordclouds are most effective when you filter the data into groups and compare them.

# Compare groups by filtering for a category
df <- single_count %>%
  filter(speaker == "Sam") %>%
  ungroup() %>%
  select(word, freq)

# Put custom colors here
colors <- c("#BE4422", "#497BB8", "#FBB861", "#8B99B6")

# Create the wordcloud. Adjust as needed. You may want to open and save it in
# a browser.
# The colors are recycled so that every row of `df` gets one. The background
# argument is spelled out in full (`backgroundColor`) rather than relying on
# partial matching of `background`.
wordcloud2(
  df,
  rotateRatio = 0,
  color = rep_len(colors, nrow(df)),
  # fontFamily = "Open Sans",
  backgroundColor = "#ffffff"
)

Single Column Chart

# Grabs the most frequent words from each speaker. Change n to change how many
# are shown.
single_count %>%
  # Group explicitly so "top n per speaker" does not depend on grouping left
  # over from an earlier step.
  group_by(speaker) %>%
  slice_max(order_by = freq, n = 5, with_ties = FALSE) %>%
  ungroup() %>%
  # reorder_within() orders the words separately inside each facet; a plain
  # fct_reorder() orders them once across all speakers, so a word shared by
  # several speakers can appear out of order in its panel.
  ggplot(aes(
    y = reorder_within(word, freq, speaker),
    x = freq,
    fill = word
  )) +
  geom_col() +
  # Strips the per-facet suffix that reorder_within() adds to the labels
  scale_y_reordered() +
  facet_wrap(~speaker, scales = "free_y") +
  labs(
    title = "Title",
    subtitle = "Subtitle"
  ) +
  theme_new +
  theme(legend.position = "none")

Bigram Analysis

Bigrams are two words together. You can do bigrams split among a categorical variable like we did for single words, or you can lump them together as we are doing here.

# Tokenize the text into bigrams and drop every bigram in which either word is
# a stop word (tidytext's stop_words list or the custom list).
bigram_split <- test_text %>%
  # bigram is new column name, text is existing text column name
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(
    !(word1 %in% stop_words$word | word1 %in% custom_stop),
    !(word2 %in% stop_words$word | word2 %in% custom_stop)
  ) %>%
  unite(bigram, word1, word2, sep = " ")

# Number of occurrences of each remaining bigram
bigram_count <- bigram_split %>%
  count(bigram, name = "freq")

Bigram Wordclouds

Remember these wordclouds are most effective when you filter the data into groups and compare them. This is the most boring cloud ever because there are so few repeated bigrams in this dataset, but in a more robust text dataset this can be particularly interesting.

# Compare groups by filtering for a category (no filter is applied here)
df <- bigram_count

# Put custom colors here
colors <- c("#BE4422", "#497BB8", "#FBB861", "#8B99B6")

# Create the wordcloud. Adjust as needed. You may want to open and save it in
# a browser.
# The colors are recycled so that every row of `df` gets one. The background
# argument is spelled out in full (`backgroundColor`) rather than relying on
# partial matching of `background`.
wordcloud2(
  df,
  rotateRatio = 0,
  color = rep_len(colors, nrow(df)),
  # fontFamily = "Open Sans",
  backgroundColor = "#ffffff"
)

Bigram Column Chart

# Horizontal column chart of the ten most frequent bigrams.
bigram_count %>%
  arrange(desc(freq)) %>%
  slice_head(n = 10) %>% # here we are just grabbing the top bigrams.
  ggplot(aes(x = freq, y = fct_reorder(bigram, freq), fill = bigram)) +
  geom_col() +
  labs(
    title = "Title",
    subtitle = "Subtitle"
  ) +
  theme_new +
  theme(legend.position = "none")

Sentiment Analysis

The sentiment analysis can be done using several different sentiment datasets. Each uses the single_split dataset we created before.

Mood/emotion graph

Splits words into emotions

# Attach the NRC emotion label(s) to every word that appears in the lexicon.
# The join key is stated explicitly instead of being guessed from the shared
# column names.
emotion_sentiment <- single_split %>%
  inner_join(get_sentiments("nrc"), by = "word")

# Count words per emotion for the selected speakers.
emotion_sentiment_grouped <- emotion_sentiment %>%
  filter(speaker %in% c("Sam", "Julie", "Jane")) %>%
  group_by(speaker, sentiment) %>%
  # "drop_last" keeps the result grouped by speaker, which is what
  # summarize() did implicitly; stating it silences the regrouping message.
  summarize(count = n(), .groups = "drop_last")

# One panel per speaker, emotions ordered by count.
emotion_sentiment_grouped %>%
  ggplot(aes(
    y = fct_reorder(sentiment, count),
    x = count,
    fill = sentiment
  )) +
  geom_col() +
  facet_wrap(~speaker) +
  labs(
    title = "Title",
    subtitle = "Subtitle"
  ) +
  theme_new +
  theme(legend.position = "none")

Sentiment Weight Graph

Splits words into weighted categories using the AFINN lexicon, which scores each word from -5 (most negative) to 5 (most positive). The further a score is from zero, the stronger the word's positive or negative connotation.

# Attach the AFINN score (`value`) to every word that appears in the lexicon.
# The join key is stated explicitly instead of being guessed from the shared
# column names.
sentiment_weight <- single_split %>%
  inner_join(get_sentiments("afinn"), by = "word")

# Histogram of scores per speaker; the fill separates positive scores from
# the rest.
sentiment_weight %>%
  ggplot(aes(x = value, fill = value > 0)) +
  geom_histogram(bins = 8) +
  facet_wrap(~speaker) +
  labs(
    title = "Title",
    subtitle = "Subtitle"
  ) +
  theme_new +
  theme(legend.position = "none")

Sentiment Binary

Splits words into just positive/negative

# Label each word positive/negative with the bing lexicon and count the words
# per speaker and label. The join key is stated explicitly instead of being
# guessed from the shared column names.
sentiment_binary <- single_split %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  group_by(speaker, sentiment) %>%
  # "drop_last" keeps the result grouped by speaker, which is what
  # summarize() did implicitly; stating it silences the regrouping message.
  summarize(count = n(), .groups = "drop_last")

# Note: if categories have large disparities in how many words each has
# entirely, a better plot may be "position = 'fill'" so you can see the
# proportion and more easily compare them.
sentiment_binary %>%
  ggplot(aes(x = speaker, y = count, fill = sentiment)) +
  geom_col(position = "dodge") +
  labs(title = "Title", subtitle = "Subtitle") +
  theme_new

Sentence Sentiment Average

Splits text into sentences, then computes the average sentiment for each. Neutral sentences included.

This code does not take categories into account. If you would like to compare different speakers, for example, make a subset of your dataframe for each speaker and run this code separately on each subset.

# Split the text into sentences.
sentence_breakdown <- get_sentences(test_text)

# Score every sentence, keep the numeric score in `sentiment_num`, and replace
# `sentiment` with a three-way label.
sentiment <- sentiment(sentence_breakdown) %>%
  # You shouldn't have to adjust these lines at all.
  mutate(
    sentiment_num = sentiment,
    sentiment = case_when(
      sentiment == 0 ~ "neutral",
      sentiment > 0 ~ "positive",
      sentiment < 0 ~ "negative"
    )
  )

# Colors are named so each label keeps its color even when one of the three
# labels does not occur in the data (an unnamed vector is assigned in order
# and would shift).
colors <- c(
  negative = "#f4927c",
  neutral = "#b3b4b5",
  positive = "#6397ce"
)

sentiment %>%
  ggplot(aes(x = sentiment_num, fill = sentiment)) +
  # 30 is the default number of bins; stating it silences the stat_bin()
  # message.
  geom_histogram(bins = 30) +
  labs(
    title = "Title",
    subtitle = "Subtitle"
  ) +
  theme_new +
  scale_fill_manual(values = colors)